Supply Chain Analysis¶

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In [2]:
df = pd.read_csv('supply_chain_data.csv')
In [3]:
df['Product type'] = df['Product type'].replace({
    'skincare': 'credit card',
    'haircare': 'charge card',
    'cosmetics': 'co-branded card'
})

0. Dataset Exploration¶

0.1 Dataset Overview¶

In [4]:
df.head()
Out[4]:
Product type SKU Price Availability Number of products sold Revenue generated Customer demographics Stock levels Lead times Order quantities ... Location Lead time Production volumes Manufacturing lead time Manufacturing costs Inspection results Defect rates Transportation modes Routes Costs
0 charge card SKU0 69.808006 55 802 8661.996792 Non-binary 58 7 96 ... Mumbai 29 215 29 46.279879 Pending 0.226410 Road Route B 187.752075
1 credit card SKU1 14.843523 95 736 7460.900065 Female 53 30 37 ... Mumbai 23 517 30 33.616769 Pending 4.854068 Road Route B 503.065579
2 charge card SKU2 11.319683 34 8 9577.749626 Unknown 1 10 88 ... Mumbai 12 971 27 30.688019 Pending 4.580593 Air Route C 141.920282
3 credit card SKU3 61.163343 68 83 7766.836426 Non-binary 23 13 59 ... Kolkata 24 937 18 35.624741 Fail 4.746649 Rail Route A 254.776159
4 credit card SKU4 4.805496 26 871 2686.505152 Non-binary 5 3 56 ... Delhi 5 414 3 92.065161 Fail 3.145580 Air Route A 923.440632

5 rows × 24 columns

0.2 Key Statistics Summary¶

In [5]:
df.describe()
Out[5]:
Price Availability Number of products sold Revenue generated Stock levels Lead times Order quantities Shipping times Shipping costs Lead time Production volumes Manufacturing lead time Manufacturing costs Defect rates Costs
count 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.00000 100.000000 100.000000 100.000000
mean 49.462461 48.400000 460.990000 5776.048187 47.770000 15.960000 49.220000 5.750000 5.548149 17.080000 567.840000 14.77000 47.266693 2.277158 529.245782
std 31.168193 30.743317 303.780074 2732.841744 31.369372 8.785801 26.784429 2.724283 2.651376 8.846251 263.046861 8.91243 28.982841 1.461366 258.301696
min 1.699976 1.000000 8.000000 1061.618523 0.000000 1.000000 1.000000 1.000000 1.013487 1.000000 104.000000 1.00000 1.085069 0.018608 103.916248
25% 19.597823 22.750000 184.250000 2812.847151 16.750000 8.000000 26.000000 3.750000 3.540248 10.000000 352.000000 7.00000 22.983299 1.009650 318.778455
50% 51.239831 43.500000 392.500000 6006.352023 47.500000 17.000000 52.000000 6.000000 5.320534 18.000000 568.500000 14.00000 45.905622 2.141863 520.430444
75% 77.198228 75.000000 704.250000 8253.976921 73.000000 24.000000 71.250000 8.000000 7.601695 25.000000 797.000000 23.00000 68.621026 3.563995 763.078231
max 99.171329 100.000000 996.000000 9866.465458 100.000000 30.000000 96.000000 10.000000 9.929816 30.000000 985.000000 30.00000 99.466109 4.939255 997.413450

1. Production Phase¶

1.1 Inventory vs Demand¶

In [6]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=df['SKU'], y=df['Availability'], mode='lines', name='Availability', line=dict(color='blue')),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=df['SKU'], y=df['Number of products sold'], mode='lines', name='Number of products sold', line=dict(color='green')),
    secondary_y=True,
)

fig.update_layout(
    title=dict(text='Availability and Number of Products Sold by SKU', x=0.5),
    xaxis_title='SKU',
    yaxis_title='Availability',
    yaxis2_title='Number of Products Sold',
    font=dict(family="Arial", size=14),
    # legend_title_text='Metric',
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5),
    width=1070,  
    height=600   
)

fig.show()
In [7]:
fig_stock_order = make_subplots(specs=[[{"secondary_y": True}]])

fig_stock_order.add_trace(
    go.Scatter(x=df['SKU'], y=df['Stock levels'], mode='lines', name='Stock levels', line=dict(color='blue')),
    secondary_y=False,
)

fig_stock_order.add_trace(
    go.Scatter(x=df['SKU'], y=df['Order quantities'], mode='lines', name='Order quantities', line=dict(color='green')),
    secondary_y=True,
)

fig_stock_order.update_layout(
    title=dict(text='Stock Levels and Order Quantities by SKU', x=0.5),
    xaxis_title='SKU',
    yaxis_title='Stock Levels',
    yaxis2_title='Order Quantities',
    font=dict(family="Arial", size=14),
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5),
    width=1070, 
    height=600  
)

fig_stock_order.show()

1.2 Quality Control¶

In [8]:
defect_rates_by_product = df.groupby("Product type")['Defect rates'].mean().reset_index()

fig = px.bar(defect_rates_by_product, x='Product type', y='Defect rates', title='Defect Rates by Product Type')

color_scale = px.colors.sequential.Turbo 
fig.update_traces(marker_color=color_scale)

fig.update_layout(
    xaxis_title="Product Type",
    yaxis_title="Mean Defect Rates",
    xaxis=dict(categoryorder='total descending'),
    yaxis=dict(title='Mean Defect Rates'),
    plot_bgcolor='white',
    title_x=0.5,
    showlegend=True
)

fig.show()
In [9]:
pivot_table = pd.pivot_table(df, values='Defect rates', 
                             index=['Transportation modes'], 
                             aggfunc='mean')

transportation_chart = px.pie(values=pivot_table["Defect rates"], 
                              names=pivot_table.index, 
                              title='Defect Rates by Transportation Mode',
                              hole=0.5,
                              color_discrete_sequence=px.colors.sequential.Turbo)
transportation_chart.update_layout(title={'text': 'Defect Rates by Transportation Mode', 'x': 0.5})

transportation_chart.show()

2. Shipping Phase¶

2.1 Cost of Transportation¶

In [10]:
shipping_cost_revenue = df.groupby(['Shipping carriers'])[['Shipping costs', 'Revenue generated']].sum().reset_index()

shipping_cost_revenue_sorted_cost = shipping_cost_revenue.sort_values(by='Shipping costs', ascending=False)

shipping_cost_revenue_sorted_revenue = shipping_cost_revenue.sort_values(by='Revenue generated', ascending=False)

fig = make_subplots(rows=1, cols=2, subplot_titles=("Total Shipping Cost by Shipping Carrier", "Total Revenue by Shipping Carrier"))

fig.add_trace(
    go.Bar(x=shipping_cost_revenue_sorted_cost['Shipping carriers'], y=shipping_cost_revenue_sorted_cost['Shipping costs'],
           name='Shipping Cost', marker_color=px.colors.sequential.Blues),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=shipping_cost_revenue_sorted_revenue['Shipping carriers'], y=shipping_cost_revenue_sorted_revenue['Revenue generated'],
           name='Revenue Generated', marker_color=px.colors.sequential.Greens),
    row=1, col=2
)

fig.update_layout(
    title=dict(text='Shipping Cost and Revenue by Shipping Carrier', x=0.5),
    font=dict(family="Arial", size=14),
    showlegend=False 
)

fig.update_xaxes(title_text="Shipping Carrier", tickangle=-45, row=1, col=1)
fig.update_xaxes(title_text="Shipping Carrier", tickangle=-45, row=1, col=2)
fig.update_yaxes(title_text="Total Shipping Cost", row=1, col=1)
fig.update_yaxes(title_text="Total Revenue", row=1, col=2)

fig.show()

2.2 Shipment Efficiency Analysis¶

In [11]:
avg_lead_times_transport = df.groupby(['Transportation modes'])['Lead times'].mean().reset_index()
avg_lead_times_transport = avg_lead_times_transport.sort_values(by='Lead times', ascending=False)
avg_lead_times_transport['Lead times'] = avg_lead_times_transport['Lead times'].round(2)

avg_shipping_times_transport = df.groupby(['Transportation modes'])['Shipping times'].mean().reset_index()
avg_shipping_times_transport = avg_shipping_times_transport.sort_values(by='Shipping times', ascending=False)
avg_shipping_times_transport['Shipping times'] = avg_shipping_times_transport['Shipping times'].round(2)

fig = make_subplots(rows=1, cols=2, subplot_titles=("Average Lead Times by Transportation Mode", "Average Shipping Times by Transportation Mode"))

fig.add_trace(
    go.Bar(x=avg_lead_times_transport['Lead times'], y=avg_lead_times_transport['Transportation modes'], 
           orientation='h', name='Lead Times', marker=dict(color=avg_lead_times_transport['Lead times'], colorscale='Blues'), 
           text=avg_lead_times_transport['Lead times'], textposition='auto'),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=avg_shipping_times_transport['Shipping times'], y=avg_shipping_times_transport['Transportation modes'], 
           orientation='h', name='Shipping Times', marker=dict(color=avg_shipping_times_transport['Shipping times'], colorscale='Blues'), 
           text=avg_shipping_times_transport['Shipping times'], textposition='auto'),
    row=1, col=2
)

fig.update_layout(
    width=1070,  
    height=600,  
    title_text="Average Lead and Shipping Times by Transportation Mode",  # Overall title
    title_x=0.5,
    showlegend=False,  
    font=dict(family="Arial", size=14),
)

fig.update_xaxes(title_text="Average Lead Time", row=1, col=1)
fig.update_yaxes(title_text="Transportation Mode", row=1, col=1)
fig.update_xaxes(title_text="Average Shipping Time", row=1, col=2)
fig.update_yaxes(title_text="Transportation Mode", row=1, col=2)

fig.show()
In [12]:
avg_lead_times_carrier = df.groupby(['Shipping carriers'])['Lead times'].mean().reset_index()
avg_lead_times_carrier = avg_lead_times_carrier.sort_values(by='Lead times', ascending=False)
avg_lead_times_carrier['Lead times'] = avg_lead_times_carrier['Lead times'].round(2)

avg_shipping_times_carrier = df.groupby(['Shipping carriers'])['Shipping times'].mean().reset_index()
avg_shipping_times_carrier = avg_shipping_times_carrier.sort_values(by='Shipping times', ascending=False)
avg_shipping_times_carrier['Shipping times'] = avg_shipping_times_carrier['Shipping times'].round(2)

fig = make_subplots(rows=1, cols=2, subplot_titles=("Average Lead Times by Shipping Carrier", "Average Shipping Times by Shipping Carrier"))

fig.add_trace(
    go.Bar(x=avg_lead_times_carrier['Lead times'], y=avg_lead_times_carrier['Shipping carriers'], 
           orientation='h', name='Lead Times', marker=dict(color=avg_lead_times_carrier['Lead times'], colorscale='Blues'), 
           text=avg_lead_times_carrier['Lead times'], textposition='auto'),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=avg_shipping_times_carrier['Shipping times'], y=avg_shipping_times_carrier['Shipping carriers'], 
           orientation='h', name='Shipping Times', marker=dict(color=avg_shipping_times_carrier['Shipping times'], colorscale='Blues'), 
           text=avg_shipping_times_carrier['Shipping times'], textposition='auto'),
    row=1, col=2
)

fig.update_layout(
    width=1070,  
    height=600,  
    title_text="Average Lead and Shipping Times by Shipping Carrier",  # Overall title
    title_x=0.5,
    showlegend=False,  
    font=dict(family="Arial", size=14),
)

fig.update_xaxes(title_text="Average Lead Time", row=1, col=1)
fig.update_yaxes(title_text="Shipping Carrier", row=1, col=1)
fig.update_xaxes(title_text="Average Shipping Time", row=1, col=2)
fig.update_yaxes(title_text="Shipping Carrier", row=1, col=2)

fig.show()
In [13]:
df['Order_Stock_Ratio'] = df['Stock levels'] / df['Order quantities']
df_sub = df[df['Order_Stock_Ratio'] <= 15]

fig = px.scatter(df_sub, x='Order_Stock_Ratio', y='Lead times', 
                 labels={'Order_Stock_Ratio': 'Stock to Order Ratio', 'Lead times': 'Lead Time'},
                 title='How Stock to Order Ratio is related to Lead Time',
                 trendline='lowess') 

fig.update_layout(
    xaxis_title="Stock to Order Ratio",
    yaxis_title="Lead Time",
    font=dict(family="Arial", size=14),
    title_x=0.5
)

fig.show()

3. Customer Segmentation¶

In [14]:
revenue_avg_by_demo_prod = df.groupby(['Customer demographics', 'Product type'])['Revenue generated'].mean().reset_index()
revenue_sum_by_demo_prod = df.groupby(['Customer demographics', 'Product type'])['Revenue generated'].sum().reset_index()

colors = px.colors.sequential.deep

fig = make_subplots(rows=1, cols=2, subplot_titles=('Average Revenue', 'Total Revenue'))

for i, product_type in enumerate(revenue_avg_by_demo_prod['Product type'].unique()):
    subset = revenue_avg_by_demo_prod[revenue_avg_by_demo_prod['Product type'] == product_type]
    fig.add_trace(
        go.Bar(
            x=subset['Customer demographics'],
            y=subset['Revenue generated'],
            name=product_type,
            marker_color=colors[i]
        ),
        row=1, col=1
    )

for i, product_type in enumerate(revenue_sum_by_demo_prod['Product type'].unique()):
    subset = revenue_sum_by_demo_prod[revenue_sum_by_demo_prod['Product type'] == product_type]
    fig.add_trace(
        go.Bar(
            x=subset['Customer demographics'],
            y=subset['Revenue generated'],
            showlegend=False,
            marker_color=colors[i]
        ),
        row=1, col=2
    )

fig.update_layout(
    title='Revenue Analysis by Customer Demographics and Product Type',
    xaxis=dict(title='Customer Demographics'),
    yaxis=dict(title='Revenue'),
    xaxis2=dict(title='Customer Demographics'),
    yaxis2=dict(title='Revenue'),
    title_x=0.5,
    showlegend=True  
)

fig.show()

4. Sales Performance¶

In [15]:
sales_data = df.groupby('Product type')['Number of products sold'].sum().reset_index()

pie_chart = px.pie(sales_data, values='Number of products sold', names='Product type', 
                   title='Sales by Product Type', 
                   hover_data=['Number of products sold'],
                   hole=0.5,
                   color_discrete_sequence=px.colors.sequential.Turbo)
                   
pie_chart.update_traces(textposition='inside', textinfo='percent+label')
pie_chart.update_layout(title={'text': 'Sales by Product Type', 'x': 0.5})
pie_chart.show()

5. LightGBM Model: Demand Prediction¶

5.1 Prediction vs. Actual Value¶

In [16]:
from sklearn.preprocessing import LabelEncoder
df_transformed = df.copy()

label_encoders = {}
for column in df_transformed.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df_transformed[column] = label_encoders[column].fit_transform(df_transformed[column])
In [17]:
X = df_transformed.loc[:,['Product type', 'SKU', 'Price', 'Availability',
        'Revenue generated', 'Customer demographics',
       'Stock levels', 'Lead times', 'Order quantities', 'Shipping times',
       'Shipping carriers', 'Shipping costs', 'Supplier name', 'Location',
       'Lead time', 'Production volumes', 'Manufacturing lead time',
       'Manufacturing costs', 'Inspection results', 'Defect rates',
       'Transportation modes', 'Routes', 'Costs']]

y = df_transformed.loc[:,'Number of products sold']

num_folds = 10

mse_scores = []
rmse_scores = []
mae_scores = []
r2_scores = []

actual_values = np.zeros(len(y))
predicted_values = np.zeros(len(y))

from sklearn.model_selection import KFold

kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

params = {
    'objective': 'regression',
    'metric': 'mean_squared_error',
    'boosting_type': 'gbdt',
    'num_leaves': 62,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'early_stopping_round': 5,
    'verbose' : -1
}
In [18]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
    
    num_round = 100  
    early_stopping_rounds = 5  

    bst = lgb.train(
        params,
        train_data,
        num_round,
        valid_sets=[test_data]
    )
    
    y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
    actual_values[test_index] = y_test
    predicted_values[test_index] = y_pred

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    target_range = np.max(y_test) - np.min(y_test)

    percentage_mse = (mse / target_range) * 100
    percentage_rmse = (rmse / target_range) * 100
    percentage_mae = (mae / target_range) * 100
    percentage_r2 = (r2 * 100)

    mse_scores.append(percentage_mse)
    rmse_scores.append(percentage_rmse)
    mae_scores.append(percentage_mae)
    r2_scores.append(percentage_r2)
In [19]:
plot_df = pd.DataFrame({
    'SKU': df['SKU'],
    'Actual': actual_values,
    'Predicted': predicted_values
})

plot_df = plot_df.sort_values(by='SKU')

fig = go.Figure()

fig.add_trace(go.Scatter(x=plot_df['SKU'], y=plot_df['Actual'], mode='lines', name='Actual'))

fig.add_trace(go.Scatter(x=plot_df['SKU'], y=plot_df['Predicted'], mode='lines', name='Predicted'))

fig.update_layout(
    title='Actual vs Predicted Number of Products Sold by SKU',
    xaxis_title='SKU',
    yaxis_title='Number of Products Sold',
    font=dict(family="Arial", size=14),
    title_x=0.5
)

fig.show()

5.2 Model Performance Metrics¶

In [20]:
avg_mse = np.mean(mse_scores)
avg_rmse = np.mean(rmse_scores)
avg_mae = np.mean(mae_scores)
avg_r2 = np.mean(r2_scores)

print(f"Average Mean Squared Error: {avg_mse:.2f}%")
print(f"Average Root Mean Squared Error: {avg_rmse:.2f}%")
print(f"Average Mean Absolute Error: {avg_mae:.2f}%")
print(f"Average R-squared: {avg_r2:.2f}%")
Average Mean Squared Error: 10389.38%
Average Root Mean Squared Error: 34.65%
Average Mean Absolute Error: 31.02%
Average R-squared: -4.38%

5.3 Other Models' Performance¶

In [21]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

data = pd.read_csv('supply_chain_data.csv')

data.columns = data.columns.str.strip()

numerical_cols = ['Price', 'Availability', 'Revenue generated', 'Stock levels',
                  'Lead times', 'Order quantities', 'Shipping times', 'Shipping costs',
                  'Lead time', 'Production volumes', 'Manufacturing lead time', 
                  'Manufacturing costs', 'Defect rates', 'Costs']

categorical_cols = ['Product type', 'SKU', 'Customer demographics', 'Shipping carriers', 
                    'Supplier name', 'Location', 'Inspection results', 
                    'Transportation modes', 'Routes']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = RandomForestRegressor(n_estimators=100, random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

X = data.drop('Number of products sold', axis=1)
y = data['Number of products sold']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

'''# Predictions and evaluation
target_range = np.max(y_test) - np.min(y_test)
percentage_mse = (mse / target_range) * 100
percentage_rmse = (rmse / target_range) * 100
percentage_mae = (mae / target_range) * 100
percentage_r2 = (r2 * 100)'''

models = {
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

def evaluate_model(model, X_train, X_test, y_train, y_test):
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    target_range = np.max(y_test) - np.min(y_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    percentage_mse = (mse / target_range) * 100
    percentage_rmse = (rmse / target_range) * 100
    percentage_mae = (mae / target_range) * 100
    percentage_r2 = (r2 * 100)
    return mse, rmse, mae, r2

results = {}
for model_name, model in models.items():
    results[model_name] = evaluate_model(model, X_train, X_test, y_train, y_test)

results_df = pd.DataFrame(results, index=['MAE', 'MSE', 'RMSE', 'R2']).T
results_df
Out[21]:
MAE MSE RMSE R2
Linear Regression 198499.007684 445.532275 399.193096 -1.080015
Gradient Boosting 165178.180824 406.421186 353.680217 -0.730855
XGBoost 191217.040688 437.283707 378.533047 -1.003709
Random Forest 158605.649115 398.253247 357.678500 -0.661984

5.4 Limitation and Improvement¶

Potential Reasons for Poor Performance¶

  • Lack of Samples: 100 records may not generate statistically significant predictions.
  • Inadequate Feature Engineering: Important features might be missing or not well-represented.
  • Suboptimal Hyperparameters: The current model parameters may not be well-tuned.
  • Insufficient Model Complexity: The model might not be complex enough to capture underlying patterns.

Potential Ways to Improve Accuracy¶

  • Increase Sample Size
    • Look into each individual orders if possible to gain a deeper insight
  • Feature Engineering:

    • Collecting more features that are more relevant to demand level to capture underlying patterns.
    • Perform feature selection to include the most relevant features.
  • Hyperparameter Tuning:

    • Use grid search or random search to find optimal hyperparameters.
    • Experiment with different values for num_leaves, learning_rate, max_depth, etc.